package com.cmge.ad.spider.pic.gaoxiao;

import java.util.List;

import org.springframework.util.StringUtils;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import com.cmge.ad.model.Picture;
import com.cmge.ad.spider.pipeline.MysqlPicturePipeline;

/**
 * Crawler for the "funny pictures" (gaoxiao) section of cuntuba.com. This site has
 * already been crawled once.
 *
 * <p>Do not enable multi-threading: the site runs a "safe dog" protection layer that
 * intercepts requests. Example list page: http://www.cuntuba.com/gaoxiao/list_117.html
 *
 * <p>List pages (URLs matching {@link #URL_LIST}) are scanned for links to detail pages
 * and for the next list page; every other page is treated as a detail page from which a
 * single {@link Picture} is extracted.
 *
 * @author	ljt
 * @time	2014-12-30 19:51:34
 */
public class CunTuBaGaoXiaoImagCrawl implements PageProcessor {

    /** Regular expression identifying list (index) page URLs. */
    public static final String URL_LIST = "/gaoxiao/list_\\w+";

    /** Highest list page number that will be followed. */
    private static final int MAX_LIST_PAGE = 210;

    /** Page number assumed when it cannot be parsed from a list page URL. */
    private static final int DEFAULT_LIST_PAGE = 1;

    /** Crawl settings: retry times 3, sleep time 100. */
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    /**
     * Dispatches the downloaded page to list-page or detail-page handling depending on
     * whether its URL matches {@link #URL_LIST}.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        if (page.getUrl().regex(URL_LIST).match()) {
            processListPage(page);
        } else {
            processDetailPage(page);
        }
    }

    /**
     * Queues every detail link found on a list page, then queues the next list page
     * while the current page number is below {@link #MAX_LIST_PAGE}.
     */
    private void processListPage(Page page) {
        // Collect all entries on the current list page.
        List<Selectable> entries = page.getHtml().xpath("//div[@class='leftcont']//dl").nodes();
        if (entries != null) {
            for (Selectable entry : entries) {
                // get() yields null when the XPath matches nothing, so the result must
                // not be dereferenced before the emptiness check.
                String detailUrl = entry.xpath("//div[@class='cont']//a/@href").get();
                if (!StringUtils.isEmpty(detailUrl)) {
                    page.addTargetRequest(detailUrl);
                }
            }
        }

        // Current list page number, taken from the URL.
        int current = parsePageNumber(page.getUrl().get());

        System.out.println("current is "+current);
        if (current < MAX_LIST_PAGE) {
            page.addTargetRequest("http://www.cuntuba.com/gaoxiao/list_"+(current+1)+".html");
        }
    }

    /**
     * Extracts the picture from a detail page and stores it under the result field
     * {@code "picture"}. Nothing is stored when no image URL is found.
     */
    private void processDetailPage(Page page) {
        String imageUrl = page.getHtml().xpath("//div[@class='leftcont']//div//img[@class='xia']/@src").get();
        if (StringUtils.isEmpty(imageUrl)) {
            return;
        }

        // NOTE(review): desc is null when the page has no matching title; the picture is
        // still emitted. Confirm the pipeline accepts a null description.
        String desc = page.getHtml().xpath("//dl[@class='listitem']//dt//h3/text()").get();

        Picture pic = new Picture();
        pic.setDesc(desc);
        // The same URL is used for both the small and the large image.
        pic.setMinImageUrl(imageUrl);
        pic.setMaxImageUrl(imageUrl);
        pic.setSource("cuntuba_gaoxiaotu");
        page.putField("picture", pic);
    }

    /**
     * Parses the page number from a list URL of the form {@code .../list_<n>.html}.
     *
     * @param url the list page URL
     * @return the parsed page number, or {@link #DEFAULT_LIST_PAGE} when the URL does not
     *         contain a parsable number between the last {@code '_'} and {@code ".html"}
     */
    private static int parsePageNumber(String url) {
        int start = url.lastIndexOf("_") + 1;
        int end = url.lastIndexOf(".html");
        if (end < start) {
            // No ".html" suffix after the last underscore; substring would be invalid.
            return DEFAULT_LIST_PAGE;
        }
        try {
            return Integer.parseInt(url.substring(start, end));
        } catch (NumberFormatException e) {
            // Best effort: report the problem and continue from the default page.
            e.printStackTrace();
            return DEFAULT_LIST_PAGE;
        }
    }

    /**
     * Returns the crawl settings used by this processor.
     *
     * @return the site configuration
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Starts the crawl at the first list page, writing results through
     * {@link MysqlPicturePipeline} with a single thread.
     *
     * @param args unused
     * @throws Exception declared for compatibility; not thrown by the visible code
     */
    public static void main(String[] args) throws Exception {
        Spider qsSpider = Spider.create(new CunTuBaGaoXiaoImagCrawl())
                .addUrl("http://www.cuntuba.com/gaoxiao/list_1.html")
                .addPipeline(new MysqlPicturePipeline())
                // Keep a single thread: see the class comment about request interception.
                .thread(1);
        qsSpider.start();
    }

}
